*********************************************
*     From address_geocoding.dta to         *
*.          scotts_address.csv              *
*           ----------  			        *
* This file creates geocoding quality       *
* indicators                                *
*********************************************


clear all
set more off

// Working directories
cd "$workpath"

// Locals we need for looping across years and IO directories
local years "2001 2003 2005 2007 2009 2011 2013 2017"



use "$privatedata/address_geo2001_2019.dta", clear

// Traitement des adresses pour extraires les codes postaux

replace scotts_address = trim(upper(scotts_address))
replace google_address1 = trim(google_address1)
replace google_address2 = trim(google_address2)
replace dmti_address = trim(dmti_address)

replace scotts_address = subinstr(scotts_address, "  ", " ", .)
replace scotts_address = subinstr(scotts_address, " ,", ", ", .)
replace google_address1 = subinstr(google_address1, "  ", " ", .)
replace google_address1 = subinstr(google_address1, " ,", ", ", .)
replace google_address2 = subinstr(google_address2, "  ", " ", .)
replace google_address2 = subinstr(google_address2, " ,", ", ", .)
replace dmti_address = subinstr(dmti_address, "  ", " ", .)
replace dmti_address = subinstr(dmti_address, " ,", ", ", .)

// Extraction des codes postaux
gen pc_g1 = substr(google_address1, -15, 7)
gen pc_g2 = substr(google_address2, -15, 7)
gen pc_s = substr(scotts_address,-14 , 7)
gen pc_d = substr(dmti_address, -13, 6)

// Correction et harmonisation des codes postaux
replace pc_g2 = "" if strpos(pc_g2, ",") != 0
replace pc_g1 = "" if strpos(pc_g1, ",") != 0
replace pc_d  = "" if strpos(pc_d, ",") != 0
replace pc_s  = "S7P0B8" if adressid == 223050
replace pc_s  = "N1H5K3" if adressid == 261259
replace pc_g2 = "R3G3L7" if adressid == 15988
replace pc_g2 = "M6J3A4" if adressid == 215912

replace pc_g2 = subinstr(pc_g2, " ", "", .)
replace pc_g1 = subinstr(pc_g1, " ", "", .)
replace pc_d = subinstr(pc_d, " ", "", .)
replace pc_s = subinstr(pc_s, " ", "", .)


// Check if postal codes are valid with 6 digits
gen taille_s  = length(pc_s)
gen taille_g2 = length(pc_g2)
gen taille_g1 = length(pc_g1)
gen taille_d  = length(pc_d)

// Suppression des cas pour lesquels la taille est differente de 6 (cas imprecis ou qui sont en dehors du Canada)
replace pc_g2 = "" if taille_g2 != 6
replace pc_g1 = "" if taille_g1 != 6
replace pc_d = substr(dmti_address, 1, 6) if taille_d!=6   //Remplcaement des cas où l'adresse dmti n'est pas écrite de la même façon que les autres
replace pc_s = "" if taille_s != 6

// Recalcul du nombre ed caracteres des codes postaux
replace taille_s = length(pc_s)
replace taille_g2 = length(pc_g2)
replace taille_g1 = length(pc_g1)
replace taille_d = length(pc_d)

// Suppression des zero
replace taille_g2 = . if taille_g2 == 0
replace taille_d = . if taille_d == 0
replace taille_s = . if taille_s == 0
replace taille_g1 = . if taille_g1 == 0

// Verification de l'harmonie des codes postaux : min = max = mean = 6
su taille_* 
capture drop taille_*

// Extraction des régions des Tri postaux 
gen rti_g2 = substr(pc_g2, 1, 3)
gen rti_g1 = substr(pc_g1, 1, 3)
gen rti_d = substr(pc_d, 1, 3)
gen rti_s = substr(pc_s, 1, 3)

// Ajustement de la variable accuracy du 2eme geocodage
// Replace acc with ROOFTOP if we have establishments, since we want to keep them
gen acc_g2adj = acc_g2
replace acc_g2adj = "ROOFTOP" if strpos(type_2, "establishment") != 0

replace lat_g1 = . if acc_g1 == "APPROXIMATE" | acc_g1 == "GEOMETRIC_CENTER"
replace lat_g2 = . if acc_g2adj == "APPROXIMATE"| acc_g2adj == "GEOMETRIC_CENTER"
replace lon_g1 = . if acc_g1 == "APPROXIMATE" | acc_g1 == "GEOMETRIC_CENTER"
replace lon_g2 = . if acc_g2adj == "APPROXIMATE"| acc_g2adj == "GEOMETRIC_CENTER"

replace pc_g1 = "" if acc_g1 == "APPROXIMATE" | acc_g1 == "GEOMETRIC_CENTER"
replace pc_g2 = "" if acc_g2adj == "APPROXIMATE"| acc_g2adj == "GEOMETRIC_CENTER"

replace rti_g1 = "" if acc_g1 == "APPROXIMATE" | acc_g1 == "GEOMETRIC_CENTER"
replace rti_g2 = "" if acc_g2adj == "APPROXIMATE"| acc_g2adj == "GEOMETRIC_CENTER"

replace acc_g1 = "" if lat_g1 == .
replace acc_g2 = "" if lat_g2 == .
replace acc_g2adj = "" if lat_g2 == .

// Identification des paires de codes postaux cohérents
gen sg2 = (pc_s == pc_g2)
gen sd = (pc_d == pc_s)
gen dg2 = (pc_g2 == pc_d)
gen g2g1 = (pc_g2 == pc_g1)
gen sg1 = (pc_s == pc_g1)
gen g1d = (pc_d == pc_g1)

// Identification des paires de régions de tri cohérentes
gen rti_sg2 = (rti_s == rti_g2)
gen rti_sg1 = (rti_s == rti_g1)
gen rti_sd = (rti_s == rti_d)
gen rti_g2g1 = (rti_g2 == rti_g1)
gen rti_g2d = (rti_g2 == rti_d)
gen rti_dg1 = (rti_d == rti_g1)


// Differenciation des valeurs manquantes des codes postaux
replace pc_g2 = "mg2" if pc_g2 == ""
replace pc_g1 = "mg1" if pc_g1 == ""
replace pc_d = "md" if pc_d == ""
replace pc_s = "ms" if pc_s == ""

// Differenciation des valeurs manquantes des régions de tri
replace rti_g2 = "mg2" if rti_g2 == ""
replace rti_g1 = "mg1" if rti_g1 == ""
replace rti_d = "md" if rti_d == ""
replace rti_s = "ms" if rti_s == ""

// Calcul des distances entre les paires de points
geodist lat_s lon_s lat_g2 lon_g2, generate(dist_sg2)
geodist lat_s lon_s lat_d lon_d, generate(dist_sd)
geodist lat_s lon_s lat_g1 lon_g1, generate(dist_sg1)
geodist lat_g2 lon_g2 lat_d lon_d, generate(dist_g2d)
geodist lat_g2 lon_g2 lat_g1 lon_g1, generate(dist_g2g1)
geodist lat_g1 lon_g1 lat_d lon_d, generate(dist_g1d)

// Calcul des distances minimale pour toutes les combinaisons possibles avec 4 et 3 adresses
gen min_dist_sdg1g2 = min(dist_sg2, dist_sd, dist_sg1, dist_g2d, dist_g2g1, dist_g1d)
gen min_dist_sdg1 = min(dist_sd, dist_sg1, dist_g1d)
gen min_dist_sdg2 = min(dist_sg2, dist_sd, dist_g2d)
gen min_dist_sg1g2 = min(dist_sg2, dist_sg1, dist_g2g1)
gen min_dist_dg1g2 = min(dist_g2d, dist_g2g1, dist_g1d)

// Postal codes matching variable
gen     match = 1 if sg1==1 & sd==1 & g1d==1 & g2g1==1 & sg2==1 & dg2==1
replace match = 2 if sg1==1 & sd==0 & g1d==0 & g2g1==1 & sg2==1 & dg2==0
replace match = 3 if sg1==1 & sd==1 & g1d==1 & g2g1==0 & sg2==0 & dg2==0
replace match = 4 if sg1==0 & sd==1 & g1d==0 & g2g1==0 & sg2==1 & dg2==1

//replace match = 5 if (sg1==0 & sd==1 & g1d==0 & g2g1==0 & sg2==0 & dg2==0) | (sg1==0 & sd==1 & g1d==0 & g2g1==1 & sg2==0 & dg2==0)
replace match = 5 if (sg1==0 & sd==1 & g1d==0 & sg2==0 & dg2==0)

//replace match = 6 if (sg1==1 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==0) | (sg1==1 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==1)
replace match = 6 if (sg1==1 & sd==0 & g1d==0 & g2g1==0 & sg2==0)

//replace match = 7 if (sg1==0 & sd==0 & g1d==0 & g2g1==0 & sg2==1 & dg2==0) | (sg1==0 & sd==0 & g1d==1 & g2g1==0 & sg2==1 & dg2==0)
replace match = 7 if (sg1==0 & sd==0 & g2g1==0 & sg2==1 & dg2==0)

replace match =  8 if sg1==0 & sd==0 & g1d==1 & g2g1==1 & sg2==0 & dg2==1
replace match =  9 if sg1==0 & sd==0 & g1d==1 & g2g1==0 & sg2==0 & dg2==0
replace match = 10 if sg1==0 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==1
replace match = 11 if sg1==0 & sd==0 & g1d==0 & g2g1==1 & sg2==0 & dg2==0
replace match = 12 if sg1==0 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==0

label define match_lbl 1  "sdg2g1 - All have the same postal code", modify
label define match_lbl 2  "sg2g1 - Only Scotts, Google1&2 have the same postal code", modify
label define match_lbl 3  "sdg1 - Only Scotts, DMTI & Google1 have the same postal code", modify
label define match_lbl 4  "sdg2 - Only Scotts, DMTI & Google2 have the same postal code", modify
label define match_lbl 5  "sd - Only Scotts and DMTI have the same postal code", modify
label define match_lbl 6  "sg1 - Only Scotts and Google1 have the same postal code", modify
label define match_lbl 7  "sg2 - Only Scotts and Google2 have the same postal code", modify
label define match_lbl 8  "dg2g1 - Only DMTI & Google1&2 have the same postal code", modify
label define match_lbl 9  "dg1 - Only DMT and Google1 have the same postal code", modify
label define match_lbl 10 "dg2 - Only DMT and Google2 have the same postal code", modify
label define match_lbl 11 "g2g1 - Only Google1&2 have the same postal code", modify
label define match_lbl 12 "All the postal codes are different", modify
label values match match_lbl


// Generating longitude and latitude variables
tostring lat* lon*, replace force
replace lat_d  = "" if lat_d  == "."
replace lon_d  = "" if lon_d  == "."
replace lat_g1 = "" if lat_g1 == "."
replace lon_g1 = "" if lon_g1 == "."
replace lat_g2 = "" if lat_g2 == "."
replace lon_g2 = "" if lon_g2 == "."


// OUR NEW PROCEDURE TO ASSIGN LAT-LON

// We take Google2 (rooftop-quality geocoding based on address + company name as the baseline)
gen lon = lon_g2 if acc_g2adj == "ROOFTOP"
gen lat = lat_g2 if acc_g2adj == "ROOFTOP"
// In case we don't have this, use Google1 (rooftop-quality geocoding based on address only)
replace lon = lon_g1 if lon == "" & acc_g1 == "ROOFTOP"
replace lat = lat_g1 if lat == "" & acc_g1 == "ROOFTOP"
// In case we don't have this either, use geocoding by address based on the DMTI address file
replace lon = lon_d  if lon == ""
replace lat = lat_d  if lat == ""

// Refinement 1: If Scott's and Google2 have different postal codes, but Scott's and Google1 have the same postal codes,
// and if Google1 is of rooftop quality, we take this
replace lon = lon_g1 if (sg2 == 0 & sg1 == 1 & acc_g1 == "ROOFTOP" & lon_g1 != "")
replace lat = lat_g1 if (sg2 == 0 & sg1 == 1 & acc_g1 == "ROOFTOP" & lat_g1 != "")

// Refinement 2: If Google2 and Google1 have different postal codes than Scott's, and DMTI has the same code, we take DMTI
replace lon = lon_d if (sd == 1 & sg1 == 0 & sg2 == 0 & lon_d != "")
replace lat = lat_d if (sd == 1 & sg1 == 0 & sg2 == 0 & lat_d != "")

// Refinement 3: If Google2 and Google1 and DMTI have different postal codes than Scott's, but DMTI and Google1 are identical
// we take Google1 (recall Google2 is the already selected default)
replace lon = lon_g1 if sg1==0 & sd==0 & sg2==0 & g1d==1 & g2g1==0 & dg2==0 & lon_g1 != "" & acc_g1 == "ROOFTOP"
replace lat = lat_g1 if sg1==0 & sd==0 & sg2==0 & g1d==1 & g2g1==0 & dg2==0 & lat_g1 != "" & acc_g1 == "ROOFTOP"


// Refinement 4: If all four postal codes are different (or missing), we take Google 2
// [NOTE] This does not seem to matter, DROP IT?
replace lon = lon_g2 if sg1==0 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==0 & lon_g2 != "" & acc_g2adj == "ROOFTOP"
replace lon = lon_g1 if sg1==0 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==0 & lon_g1 != "" & lon_g1 == "" & acc_g1 == "ROOFTOP"
replace lon = lon_d if sg1==0 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==0 & lon_g1 == "" & lon_g2 == "" & lon_d != ""

replace lat = lat_g2 if sg1==0 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==0 & lat_g2 != "" & acc_g2adj == "ROOFTOP"
replace lat = lat_g1 if sg1==0 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==0 & lat_g1 != "" & lat_g1 == "" & acc_g1 == "ROOFTOP"
replace lat = lat_d if sg1==0 & sd==0 & g1d==0 & g2g1==0 & sg2==0 & dg2==0 & lat_g1 == "" & lat_g2 == "" & lat_d != ""

// Refinement 5: For those where we don't have rooftop at all, let's keep lower-quality geocoding (range interpolated for now)
// hierarchically from Google2 first and Google1 then and finally DMTI
replace lon = lon_g2 if lon == "" & lon_g2 != "" 
replace lat = lat_g2 if lat == "" & lat_g2 != ""
replace lon = lon_g1 if lon == "" & lon_g1 != ""
replace lat = lat_g1 if lat == "" & lat_g1 != ""
replace lon = lon_d  if lon == "" & lon_d  != ""
replace lat = lat_d  if lat == "" & lat_d  != ""


// Generate information on the origin of the coordinates based on equality between lon/lat and lon_* / lat_*
gen geocodedby = "google2" if lon == lon_g2 & lat == lat_g2
replace geocodedby = "google1" if lon == lon_g1 & lat == lat_g1 & geocodedby == ""
replace geocodedby = "dmti" if lon == lon_d & lat == lat_d & geocodedby == ""
replace geocodedby = "" if lon == ""


// Generating the geocoding quality variable
scalar alpha = 0.25

gen quality = .

// Highest quality: At least one of the geocoded postal codes matches with that provided by Scott's, and the distances are below 250m
replace quality = 1 if (match == 1 & min_dist_sdg1g2 <= alpha) | (match == 2 & min_dist_sg1g2 <= alpha) | (match == 3 & min_dist_sdg1 <= alpha) | (match == 4 & min_dist_sdg2 <= alpha) | (match == 5 & dist_sd <= alpha) | (match == 6 & dist_sg1 <= alpha) | (match == 7 & dist_sg2 <= alpha) | (match == 8 & rti_s == rti_d & rti_d == rti_g1 & rti_g1 == rti_g2 & min_dist_dg1g2 <= alpha)					
// Medium quality: No postal code matches with that provided by Scott's, but the Scott's FSA matches and at least two geocodes are less than 250m
replace quality = 2 if (match == 9 & rti_s == rti_g1 & dist_g1d <= alpha) | (match == 10 & rti_s == rti_g2 & dist_g2d <= alpha) | (match == 11 & rti_s == rti_g2 & dist_g2g1 <= alpha) | (match == 12 & rti_s == rti_g1 & rti_s == rti_g2 & min_dist_sg1g2 < alpha)
// Lowest quality: Suspicious
replace quality = 3 if quality == . & lon != ""

// Accuracy 
// Google2 was used to geocode
gen accuracy = acc_g2adj if lon == lon_g2 & lat == lat_g2
// Google 1 was used to geocode
replace accuracy = acc_g1 if lon == lon_g1 & lat==lat_g1 & accuracy == ""
// DMTI was used
replace accuracy = acc_d if lon == lon_d & lat == lat_d & accuracy == ""
// The rest
replace accuracy = "" if lon == ""

destring lat* lon*, replace force

drop if lon == .

compress

save "$outpath/address_geo2001_2017.dta", replace
